source('utils.R')
#devtools::install_github("ujjwalkarn/xda")
library(knitr)
library(ggjoy)
#devtools::install_github("vsimko/corrplot")
library(corrplot)
# First run feature-engineering chunk from preprocessing.Rmd
# Re-attach dplyr so that it sits in front of the packages attached above on
# the search path; its verbs then take precedence over same-named functions
# from those packages (avoids collisions between package functions).
# detach() raises an error when the package is not currently attached, so it
# is only called when dplyr is actually on the search path. This keeps the
# script runnable in a fresh session as well as after the preprocessing chunk.
if ("package:dplyr" %in% search()) {
  detach("package:dplyr", character.only = TRUE)
}
library("dplyr", character.only = TRUE)
# Build the analysis data frame in a single pipeline:
#   1. load the wine data (helper defined outside this file),
#   2. inject missing values into `pH` via introduce_nas() -- presumably 22.5
#      is the percentage of values to blank out; confirm against the helper,
#   3. derive `other sulfur dioxide` as total minus free sulfur dioxide,
#   4. drop `total sulfur dioxide`, which the derived column replaces.
df <- load_data() %>%
  introduce_nas(22.5, 'pH') %>%
  mutate(
    `other sulfur dioxide` = `total sulfur dioxide` - `free sulfur dioxide`
  ) %>%
  select(-`total sulfur dioxide`)
Dataset description: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names
Input variables (based on physicochemical tests):
1 - fixed acidity
2 - volatile acidity
3 - citric acid
4 - residual sugar
5 - chlorides
6 - free sulfur dioxide
7 - total sulfur dioxide -> Removed
8 - other sulfur dioxide -> Created from ‘total sulfur dioxide’ - ‘free sulfur dioxide’
9 - density
10 - pH
11 - sulphates
12 - alcohol
13 - wine_colour
Output variable (based on sensory data):
14 - quality (score between 0 and 10)
The two datasets are related to red and white variants of the Portuguese “Vinho Verde” wine. For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).
These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.
Number of Instances: red wine - 1599; white wine - 4898.
Number of Attributes: 12 + output attribute
Note: several of the attributes may be correlated, thus it makes sense to apply some sort of feature selection.
# Per-column descriptive statistics for the numeric columns of df: counts,
# mean/sd/range, unique and zero counts, IQR-based outlier bounds and outlier
# counts, kurtosis/skewness, mode, missing values and selected percentiles.
xda::numSummary(df)
## n mean sd max min range nunique
## fixed acidity 6497 7.215 1.296 15.900 3.800 12.1000 106
## volatile acidity 6497 0.340 0.165 1.580 0.080 1.5000 187
## citric acid 6497 0.319 0.145 1.660 0.000 1.6600 89
## residual sugar 6497 5.443 4.758 65.800 0.600 65.2000 316
## chlorides 6497 0.056 0.035 0.611 0.009 0.6020 214
## free sulfur dioxide 6497 30.525 17.749 289.000 1.000 288.0000 135
## density 6497 0.995 0.003 1.039 0.987 0.0519 998
## pH 5036 3.218 0.160 4.010 2.720 1.2900 106
## sulphates 6497 0.531 0.149 2.000 0.220 1.7800 111
## alcohol 6497 10.492 1.193 14.900 8.000 6.9000 111
## quality 6497 5.818 0.873 9.000 3.000 6.0000 7
## other sulfur dioxide 6495 85.237 45.418 331.000 3.000 328.0000 251
## nzeros iqr lowerbound upperbound noutlier
## fixed acidity 0 1.30000 4.4500 9.650 357
## volatile acidity 0 0.17000 -0.0250 0.655 377
## citric acid 151 0.14000 0.0400 0.600 509
## residual sugar 0 6.30000 -7.6500 17.550 118
## chlorides 0 0.02700 -0.0025 0.106 286
## free sulfur dioxide 0 24.00000 -19.0000 77.000 62
## density 0 0.00465 0.9854 1.004 3
## pH 0 0.21000 2.7950 3.635 54
## sulphates 0 0.17000 0.1750 0.855 191
## alcohol 0 1.80000 6.8000 14.000 3
## quality 0 1.00000 3.5000 7.500 228
## other sulfur dioxide 0 61.00000 -36.5000 207.500 20
## kurtosis skewness mode miss miss% 1% 5%
## fixed acidity 5.054 1.722 6.800 0 0.0000 5.100 5.700
## volatile acidity 2.820 1.494 0.280 0 0.0000 0.120 0.160
## citric acid 2.393 0.472 0.300 0 0.0000 0.000 0.050
## residual sugar 4.353 1.435 2.000 0 0.0000 0.900 1.200
## chlorides 50.841 5.397 0.044 0 0.0000 0.021 0.028
## free sulfur dioxide 7.896 1.220 29.000 0 0.0000 4.000 6.000
## density 6.597 0.503 0.997 0 0.0000 0.989 0.990
## pH 0.429 0.386 NA 1461 22.4873 2.890 2.970
## sulphates 8.643 1.796 0.500 0 0.0000 0.300 0.350
## alcohol -0.533 0.565 9.500 0 0.0000 8.700 9.000
## quality 0.230 0.190 6.000 0 0.0000 4.000 5.000
## other sulfur dioxide -0.322 0.101 101.000 2 0.0308 6.000 10.000
## 25% 50% 75% 95% 99%
## fixed acidity 6.400 7.000 7.700 9.800 12.000
## volatile acidity 0.230 0.290 0.400 0.670 0.880
## citric acid 0.250 0.310 0.390 0.560 0.740
## residual sugar 1.800 3.000 8.100 15.000 18.200
## chlorides 0.038 0.047 0.065 0.102 0.186
## free sulfur dioxide 17.000 29.000 41.000 61.000 77.000
## density 0.992 0.995 0.997 0.999 1.001
## pH 3.110 3.210 3.320 3.500 3.630
## sulphates 0.430 0.510 0.600 0.790 0.990
## alcohol 9.500 10.300 11.300 12.700 13.400
## quality 5.000 6.000 6.000 7.000 8.000
## other sulfur dioxide 55.000 86.000 116.000 159.000 189.000
# Summary of the character columns of df: row count, missing values, number
# of distinct levels and the most frequent levels with their counts.
xda::charSummary(df)
## n miss miss% unique top5levels:count
## wine_colour 6497 0 0 2 white:4898, red:1599
In this update I introduce the use of Bayesian blocks. In a few words: these are histograms with dynamic binning, so you don’t have to choose the bin width yourself (a poor choice can make histograms confusing). The bin edges are calculated by optimizing a fitness function, so you may want to choose a fitness function that suits your problem well. Here we are going to use the method’s default fitness function: “SIC”.
# Distribution plots for df, first with Bayesian-blocks (dynamic) binning and
# then as regular histograms. Both helpers are defined outside this file
# (presumably in utils.R) -- check there for which columns are plotted.
plot_bay_blocks(df)
plot_histograms(df)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 1461 rows containing non-finite values (stat_bin).
# Boxplots for df; helper defined outside this file (presumably in utils.R).
# Rows with missing values are dropped by ggplot2, hence the warnings below.
plot_boxplots(df)
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).
# Ridgeline ("joy") density plots for df; helper defined outside this file
# (presumably in utils.R). One bandwidth message is emitted per plot.
plot_joyplots(df)
## Picking joint bandwidth of 0.223
## Picking joint bandwidth of 0.0252
## Picking joint bandwidth of 0.0274
## Picking joint bandwidth of 0.471
## Picking joint bandwidth of 0.0024
## Picking joint bandwidth of 2.47
## Picking joint bandwidth of 4.93
## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).
## Picking joint bandwidth of 0.000418
## Picking joint bandwidth of 0.0283
## Warning: Removed 1461 rows containing non-finite values
## (stat_density_ridges).
## Picking joint bandwidth of 0.0224
## Picking joint bandwidth of 0.211
## Picking joint bandwidth of 0.138
# Additional exploratory plots produced by a helper defined outside this file
# (presumably in utils.R); what exactly it draws cannot be told from here.
# NOTE(review): the NULL lines printed below suggest the helper returns or
# prints NULL values as a by-product -- confirm in the helper.
other_plots(df)
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).
# Boxplots restricted to red wines; helper defined outside this file.
# Presumably one boxplot per variable grouped by quality -- TODO confirm.
boxplots_quality_colour_wine(df, 'red')
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 356 rows containing non-finite values (stat_boxplot).
# Boxplots restricted to white wines; helper defined outside this file.
# Presumably one boxplot per variable grouped by quality -- TODO confirm.
boxplots_quality_colour_wine(df, 'white')
## Warning: Removed 1105 rows containing non-finite values (stat_boxplot).
# Correlation matrix over all wines: keep only complete rows, remove the
# character column wine_colour, then correlate the remaining columns
# (cor() with its default method).
corrs <- df %>%
  drop_na() %>%
  select(-wine_colour) %>%
  cor()
corrs
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 0.2120 0.32161
## volatile acidity 0.2120 1.0000 -0.38134
## citric acid 0.3216 -0.3813 1.00000
## residual sugar -0.1160 -0.1932 0.14439
## chlorides 0.3036 0.3771 0.03840
## free sulfur dioxide -0.2770 -0.3465 0.13108
## density 0.4581 0.2744 0.09401
## pH -0.2467 0.2755 -0.33595
## sulphates 0.3079 0.2366 0.06149
## alcohol -0.1005 -0.0417 -0.00723
## quality -0.0737 -0.2637 0.08563
## other sulfur dioxide -0.2980 -0.3777 0.18942
## residual sugar chlorides free sulfur dioxide density
## fixed acidity -0.1160 0.3036 -0.2770 0.45814
## volatile acidity -0.1932 0.3771 -0.3465 0.27443
## citric acid 0.1444 0.0384 0.1311 0.09401
## residual sugar 1.0000 -0.1288 0.3991 0.55213
## chlorides -0.1288 1.0000 -0.1983 0.35990
## free sulfur dioxide 0.3991 -0.1983 1.0000 0.02621
## density 0.5521 0.3599 0.0262 1.00000
## pH -0.2780 0.0419 -0.1495 0.00664
## sulphates -0.1869 0.4250 -0.1907 0.26528
## alcohol -0.3598 -0.2549 -0.1825 -0.68454
## quality -0.0222 -0.1935 0.0524 -0.29406
## other sulfur dioxide 0.4622 -0.2720 0.5054 0.02938
## pH sulphates alcohol quality
## fixed acidity -0.24669 3.08e-01 -1.00e-01 -0.0737
## volatile acidity 0.27548 2.37e-01 -4.17e-02 -0.2637
## citric acid -0.33595 6.15e-02 -7.23e-03 0.0856
## residual sugar -0.27800 -1.87e-01 -3.60e-01 -0.0222
## chlorides 0.04188 4.25e-01 -2.55e-01 -0.1935
## free sulfur dioxide -0.14947 -1.91e-01 -1.83e-01 0.0524
## density 0.00664 2.65e-01 -6.85e-01 -0.2941
## pH 1.00000 1.93e-01 1.30e-01 0.0196
## sulphates 0.19266 1.00e+00 -9.57e-05 0.0394
## alcohol 0.13023 -9.57e-05 1.00e+00 0.4440
## quality 0.01960 3.94e-02 4.44e-01 1.0000
## other sulfur dioxide -0.24273 -2.79e-01 -2.61e-01 -0.0732
## other sulfur dioxide
## fixed acidity -0.2980
## volatile acidity -0.3777
## citric acid 0.1894
## residual sugar 0.4622
## chlorides -0.2720
## free sulfur dioxide 0.5054
## density 0.0294
## pH -0.2427
## sulphates -0.2791
## alcohol -0.2605
## quality -0.0732
## other sulfur dioxide 1.0000
# Mixed correlogram for all wines: coefficients as numbers in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(
  corrs,
  lower = "number",
  upper = "ellipse"
)
# Correlation matrix for red wines only: keep complete rows, keep the rows
# labelled 'red', remove the colour column, then correlate what is left.
corrs_red <- df %>%
  drop_na() %>%
  filter(wine_colour == 'red') %>%
  select(-wine_colour) %>%
  cor()
corrs_red
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 -0.26350 0.6745
## volatile acidity -0.2635 1.00000 -0.5520
## citric acid 0.6745 -0.55197 1.0000
## residual sugar 0.0942 -0.00669 0.1618
## chlorides 0.0963 0.07086 0.2137
## free sulfur dioxide -0.1416 -0.01440 -0.0536
## density 0.6653 0.02531 0.3652
## pH -0.6855 0.22781 -0.5498
## sulphates 0.2018 -0.26633 0.3269
## alcohol -0.0584 -0.22483 0.1132
## quality 0.1315 -0.39789 0.2333
## other sulfur dioxide -0.0823 0.11796 0.0604
## residual sugar chlorides free sulfur dioxide density
## fixed acidity 0.09425 0.09627 -0.14156 0.66531
## volatile acidity -0.00669 0.07086 -0.01440 0.02531
## citric acid 0.16182 0.21369 -0.05362 0.36522
## residual sugar 1.00000 0.01745 0.16937 0.32322
## chlorides 0.01745 1.00000 -0.00788 0.18804
## free sulfur dioxide 0.16937 -0.00788 1.00000 -0.00389
## density 0.32322 0.18804 -0.00389 1.00000
## pH -0.08336 -0.28535 0.08404 -0.34974
## sulphates 0.00647 0.40972 0.07475 0.16725
## alcohol 0.05773 -0.23207 -0.06806 -0.49667
## quality 0.00595 -0.14218 -0.06262 -0.18176
## other sulfur dioxide 0.22668 0.05597 0.42645 0.11106
## pH sulphates alcohol quality
## fixed acidity -0.6855 0.20181 -0.0584 0.13146
## volatile acidity 0.2278 -0.26633 -0.2248 -0.39789
## citric acid -0.5498 0.32693 0.1132 0.23326
## residual sugar -0.0834 0.00647 0.0577 0.00595
## chlorides -0.2854 0.40972 -0.2321 -0.14218
## free sulfur dioxide 0.0840 0.07475 -0.0681 -0.06262
## density -0.3497 0.16725 -0.4967 -0.18176
## pH 1.0000 -0.21332 0.2142 -0.06355
## sulphates -0.2133 1.00000 0.0875 0.24207
## alcohol 0.2142 0.08753 1.0000 0.48194
## quality -0.0635 0.24207 0.4819 1.00000
## other sulfur dioxide -0.0901 0.01118 -0.2227 -0.21432
## other sulfur dioxide
## fixed acidity -0.0823
## volatile acidity 0.1180
## citric acid 0.0604
## residual sugar 0.2267
## chlorides 0.0560
## free sulfur dioxide 0.4264
## density 0.1111
## pH -0.0901
## sulphates 0.0112
## alcohol -0.2227
## quality -0.2143
## other sulfur dioxide 1.0000
# Mixed correlogram for red wines: coefficients as numbers in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(
  corrs_red,
  lower = "number",
  upper = "ellipse"
)
# Correlation matrix for white wines only: keep complete rows, keep the rows
# labelled 'white', remove the colour column, then correlate what is left.
corrs_white <- df %>%
  drop_na() %>%
  filter(wine_colour == 'white') %>%
  select(-wine_colour) %>%
  cor()
corrs_white
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 -0.03507 0.2752
## volatile acidity -0.0351 1.00000 -0.1622
## citric acid 0.2752 -0.16220 1.0000
## residual sugar 0.0829 0.06798 0.0971
## chlorides 0.0347 0.06093 0.0930
## free sulfur dioxide -0.0482 -0.08849 0.0909
## density 0.2654 0.02722 0.1471
## pH -0.4158 -0.00793 -0.1671
## sulphates -0.0308 -0.02798 0.0668
## alcohol -0.1285 0.07614 -0.0726
## quality -0.1148 -0.18847 -0.0120
## other sulfur dioxide 0.1403 0.14884 0.1066
## residual sugar chlorides free sulfur dioxide density
## fixed acidity 0.0829 0.0347 -0.04824 0.2654
## volatile acidity 0.0680 0.0609 -0.08849 0.0272
## citric acid 0.0971 0.0930 0.09094 0.1471
## residual sugar 1.0000 0.0993 0.29635 0.8406
## chlorides 0.0993 1.0000 0.10000 0.2663
## free sulfur dioxide 0.2964 0.1000 1.00000 0.2912
## density 0.8406 0.2663 0.29119 1.0000
## pH -0.2077 -0.0854 -0.00541 -0.1063
## sulphates -0.0222 0.0283 0.05818 0.0672
## alcohol -0.4525 -0.3611 -0.25461 -0.7759
## quality -0.0775 -0.1966 0.00803 -0.2918
## other sulfur dioxide 0.3480 0.2036 0.26700 0.5061
## pH sulphates alcohol quality
## fixed acidity -0.41579 -0.0308 -0.1285 -0.11477
## volatile acidity -0.00793 -0.0280 0.0761 -0.18847
## citric acid -0.16709 0.0668 -0.0726 -0.01197
## residual sugar -0.20765 -0.0222 -0.4525 -0.07746
## chlorides -0.08537 0.0283 -0.3611 -0.19657
## free sulfur dioxide -0.00541 0.0582 -0.2546 0.00803
## density -0.10634 0.0672 -0.7759 -0.29182
## pH 1.00000 0.1531 0.1340 0.10272
## sulphates 0.15307 1.0000 -0.0052 0.06100
## alcohol 0.13397 -0.0052 1.0000 0.43317
## quality 0.10272 0.0610 0.4332 1.00000
## other sulfur dioxide 0.00217 0.1466 -0.4317 -0.21347
## other sulfur dioxide
## fixed acidity 0.14027
## volatile acidity 0.14884
## citric acid 0.10656
## residual sugar 0.34798
## chlorides 0.20355
## free sulfur dioxide 0.26700
## density 0.50610
## pH 0.00217
## sulphates 0.14662
## alcohol -0.43173
## quality -0.21347
## other sulfur dioxide 1.00000
# Mixed correlogram for white wines: coefficients as numbers in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(
  corrs_white,
  lower = "number",
  upper = "ellipse"
)